############################################
############################################
# previous: nothing this is the first step #
############################################

###### Summary ############################################################################################
#This script extracts the data from all source files and combines them into a single file with a consistent
#format & columns. It then corrects any known problematic symbols contained in the file, missing values, 
#incorrect spellings or synonyms for species- i.e. cleans the file to a usable standard for further analyses.
###########################################################################################################

############################################
# next: 2-VAP_data_justspp #################
############################################

##############################################################################################
#convert GBIF files from tab delimited into comma delimited files before using this script

#load required packages:
library(R.utils)
library(raster)
library(maptools)
library(rgdal)
library(stringr)
library(sf)
library(rgeos)

# Project identifier; it is embedded in the names of the input and output files.
projectName <- "WHOSnakes"

# First directory layout (Unix-style home directory).
# Every variable set here is assigned again by the second layout below, so the
# second layout is the one that is in effect for the rest of the script.
MyDir   <- "/home/jc217070/Maxent"
PlotDir <- paste0(MyDir, "/Maxent_Data/Plots/Plots_WHOSnakes_2020_09_01")
DataDir <- paste0(MyDir, "/Maxent_Data")
SWDDir  <- paste0(MyDir, "/Maxent_SWDs/spp_swds")
LUDir   <- paste0(MyDir, "/Maxent_LUTables")

# Second directory layout (Windows/Dropbox); overrides all of the values above.
MyDir   <- "C:/Users/pintora/Dropbox/temp/WHOSnakeData"
PlotDir <- paste0(MyDir, "/1_Maxent_Data/Plots/Plots_WHOSnakes_2022_02_07")
DataDir <- paste0(MyDir, "/1_Maxent_Data")
LUDir   <- paste0(MyDir, "/2_Maxent_LUTables")
SWDDir  <- paste0(MyDir, "/3_Maxent_SWDs/spp_swds")

# Species reference table: one row per taxon, read from <LUDir>/<projectName>.csv.
sppSumPath <- paste0(LUDir, "/", projectName, ".csv")
SppSum <- read.table(file = sppSumPath, header = TRUE, sep = ",")
spp <- SppSum$gen_sp_subtax

# Exclusion table; its `sp` and `shouldbe` columns are converted from
# space-separated to underscore-separated names.
vetPath <- paste0(LUDir, "/", projectName, "_Exclusions_Final.csv")
Vet <- read.table(file = vetPath, header = TRUE, sep = ",")
Vet$sp <- gsub(" ", "_", Vet$sp, fixed = TRUE)
Vet$shouldbe <- gsub(" ", "_", Vet$shouldbe, fixed = TRUE)

# Copy of the reference table, and the taxa flagged with spExcept == 1 in it.
Vet2 <- SppSum
spExcept <- Vet2$gen_sp_subtax[Vet2$spExcept == 1]

# All entries directly inside DataDir whose name contains the project name
# (sub-directories are not searched).
allfiles <- list.files(path = DataDir, pattern = projectName,
                       full.names = TRUE, recursive = FALSE)

# Split the file list by data source, using the source prefix found in the path.
GBIFfiles       <- allfiles[grepl("GBIF_", allfiles)]
ALAfiles        <- allfiles[grepl("ALA_", allfiles)]
INATfiles       <- allfiles[grepl("INAT_", allfiles)]
HerpMapperfiles <- allfiles[grepl("HerpMapper_", allfiles)]
SIGHTfiles      <- allfiles[grepl("SIGHT_", allfiles)]
LITfiles        <- allfiles[grepl("LIT_", allfiles)]
VERTfiles       <- allfiles[grepl("VERT_", allfiles)]

# Empty accumulators: `alldata` collects the combined records, `ExtraData` is
# filled further down with records that still need manual georeferencing.
alldata <- data.frame()
ExtraData <- data.frame()

# Request a larger memory limit before the source files are loaded.
memory.limit(size = 500000)

# Read every source file, map its columns onto one shared 23-column layout and
# stack the results into `alldata` (every column stored as character).
#
# Behaviour worth knowing:
#  * files are parsed as plain CSV: only the double quote is a quoting
#    character and '#' is ordinary text, so apostrophes or hashes inside
#    locality descriptions cannot break or truncate a record;
#  * a file whose path matches none of the known source prefixes is skipped
#    with a warning instead of re-using the records of the previous file;
#  * a file without any (remaining) rows is skipped with a warning;
#  * the per-file tables are collected in a list and bound once at the end.

# Source label -> files of that source. The order is the priority that applies
# when a path happens to match more than one prefix.
sourceGroups <- list(GBIF = GBIFfiles, ALA = ALAfiles, INAT = INATfiles,
                     HerpMapper = HerpMapperfiles, SIGHT = SIGHTfiles,
                     VERT = VERTfiles, LIT = LITfiles)

# Common column layout that every source is mapped onto.
stdCols <- c("OUN","kingdom","phylum","class","order","family","genus","species","subtaxon","taxonrank","scientificName",
             "country","lat","long","uncertainty","precision","date","day","month","year","establishmentMeans","issue","locality")

# Returns the part of a file name that follows "<source>_<projectName>_",
# without the ".csv" extension. Both pieces are matched literally.
getFileID <- function(path, sourceLabel) {
  stem <- sub("\\.csv$", "", basename(path))
  sub(paste0(sourceLabel, "_", projectName, "_"), "", stem, fixed = TRUE)
}

# One slot per file; slots of skipped files stay NULL.
recList <- vector("list", length(allfiles))

for (i in seq_along(allfiles)){

  file <- allfiles[i]
  print(paste(Sys.time(),"processing file", i, file, sep=" "))

  # work out which source this file belongs to
  inGroup <- vapply(sourceGroups, function(grp) file %in% grp, logical(1))
  if (!any(inGroup)) {
    warning("skipping file with unrecognised source prefix: ", file, call. = FALSE)
    next
  }
  src <- names(sourceGroups)[which(inGroup)[1]]

  rawrecs <- read.table(file, sep = ",", header = TRUE, quote = "\"",
                        comment.char = "", stringsAsFactors = FALSE)

  if (src == "GBIF") {
    # exclude Benin dataset; rows with a missing datasetKey are kept as they are
    keepRow <- is.na(rawrecs$datasetKey) |
      rawrecs$datasetKey != '5e1d2d54-f5db-43ac-87c5-55a9f79ac718'
    rawrecs <- rawrecs[keepRow, ]
  }

  if (nrow(rawrecs) == 0) {
    warning("skipping file without records: ", file, call. = FALSE)
    next
  }

  # literature records carry their own reference as ID, all other sources use
  # the file name
  ID <- if (src == "LIT") rawrecs$Ref else getFileID(file, src)

  # placeholder for columns a source does not provide
  filler <- rep("", nrow(rawrecs))

  if (src == "GBIF") {

    redrecs <- rawrecs[,c("gbifID","kingdom","phylum","class","order","family","genus","species",
                          "infraspecificEpithet","taxonRank","scientificName","countryCode",
                          "decimalLatitude","decimalLongitude",
                          "coordinateUncertaintyInMeters","coordinatePrecision",
                          "eventDate","day","month","year",
                          "establishmentMeans","issue","locality")]

  } else if (src == "ALA") {

    # `species` may hold only the epithet; prepend the genus in that case
    spelem <- str_split_fixed(rawrecs$species, " ", 2)
    species <- ifelse(spelem[,2]=="",paste(rawrecs$genus,rawrecs$species),paste(rawrecs$species))
    issue <- filler

    redrecs <- cbind(rawrecs[,c("catalogNumber","kingdom","phylum","class","order","family","genus")],
                     species,
                     rawrecs[,c("infraspecificEpithet","taxonRank","scientificName","country",
                                "Latitude...ungeneralised","Longitude...ungeneralised",
                                "coordinateUncertaintyInMeters","coordinatePrecision",
                                "eventDate","day","month","year",
                                "establishmentMeans")],
                     issue,
                     rawrecs[,c("locality")])

  } else if (src == "INAT") {

    # split the taxon names into genus / species / subspecies elements
    sspelem <- str_split_fixed(rawrecs$taxon_subspecies_name, " ", 3)
    spelem <- str_split_fixed(rawrecs$taxon_species_name, " ", 2)
    # split the observation date on "/" into its 3 elements
    str_date <- str_split_fixed(rawrecs$observed_on,"/",3)
    fulltaxon <- ifelse(rawrecs$taxon_subspecies_name!="",paste(rawrecs$taxon_subspecies_name),paste(rawrecs$taxon_species_name))

    redrecs <- cbind(rawrecs[,c("id","taxon_kingdom_name","taxon_phylum_name","taxon_class_name","taxon_order_name","taxon_family_name")],
                     spelem[,1],paste(spelem[,1],spelem[,2]),
                     sspelem[,3],filler,fulltaxon,
                     rawrecs[,c("place_guess",
                                "latitude","longitude",
                                "positional_accuracy","positional_accuracy",
                                "observed_on")],
                     str_date[,1],str_date[,2],str_date[,3],filler,
                     rawrecs[,c("num_identification_disagreements","place_guess")])

  } else if (src == "HerpMapper") {

    # split the taxon name into genus / species / subtaxon
    sspelem <- str_split_fixed(rawrecs$Taxon, " ", 3)
    # split the date on "/" into its 3 elements
    str_date <- str_split_fixed(rawrecs$Date,"/",3)
    fulltaxon <- ifelse(sspelem[,3]=="",paste(sspelem[,1],sspelem[,2]),paste(sspelem[,1],sspelem[,2],sspelem[,3]))

    redrecs <- cbind(rawrecs[,c("UUID")],
                     filler,filler,filler,filler,filler,sspelem[,1],paste(sspelem[,1],sspelem[,2]),sspelem[,3],filler,
                     fulltaxon,
                     rawrecs[,c("Country",
                                "Latitude","Longitude",
                                "Accuracy","Accuracy",
                                "Date")],
                     str_date[,1],str_date[,2],str_date[,3],filler,filler,
                     paste(rawrecs[,c("Level.2")],", ",rawrecs[,c("Level.1")],", ",rawrecs[,c("Country")]))

  } else if (src == "SIGHT") {

    # split the taxon name into genus / species / subtaxon
    sspelem <- str_split_fixed(rawrecs$scientificName, " ", 3)
    # NOTE(review): the date is split here but the date/day/month/year columns
    # below are filled with blanks - confirm that this is intended
    str_date <- str_split_fixed(rawrecs$Date,"/",3)
    fulltaxon <- ifelse(sspelem[,3]=="",paste(sspelem[,1],sspelem[,2]),paste(sspelem[,1],sspelem[,2],sspelem[,3]))

    redrecs <- cbind(rawrecs[,c("GUID")],
                     filler,filler,filler,filler,filler,sspelem[,1],paste(sspelem[,1],sspelem[,2]),sspelem[,3],filler,
                     fulltaxon,filler,
                     rawrecs[,c("lat","long","accuracy","accuracy")],
                     filler,filler,filler,filler,
                     rawrecs[,c("health")],filler,rawrecs[,c("Site.name")])

  } else if (src == "VERT") {

    redrecs <- cbind(rawrecs[,c("catalognumber","kingdom","phylum","class","order","family","genus","specificepithet",
                                "infraspecificepithet","taxonrank","scientificname","country",
                                "decimallatitude","decimallongitude",
                                "coordinateuncertaintyinmeters","coordinateprecision",
                                "eventdate","day","month","year",
                                "establishmentmeans","locationremarks")],
                     paste(rawrecs[,c("locality")],", ",rawrecs[,c("municipality")],", ",
                           rawrecs[,c("stateprovince")],", ",rawrecs[,c("county")],", ",
                           rawrecs[,c("country")]))

  } else {
    # src == "LIT"

    # split the taxon name into genus / species / subtaxon
    spelem <- str_split_fixed(rawrecs$Species, " ", 3)
    fulltaxon <- ifelse(spelem[,3]=="",paste(spelem[,1],spelem[,2]),paste(spelem[,1],spelem[,2],spelem[,3]))

    redrecs <- cbind(rawrecs[,c("LITID")],
                     filler,filler,filler,filler,filler,spelem[,1],paste(spelem[,1],spelem[,2]),spelem[,3],filler,
                     fulltaxon,
                     rawrecs[,c("Country","Latitude","Longitude","Confidence","Confidence","Date","day","month","year")],
                     rawrecs[,c("Voucher","Status")],
                     paste(rawrecs[,c("Locality.detailed")],", ",
                           rawrecs[,c("Locality.General")],", ",
                           rawrecs[,c("Country")]))
  }

  print(paste(dim(redrecs)))
  print(paste(names(redrecs)))

  # every branch above must deliver exactly the shared layout
  stopifnot(ncol(redrecs) == length(stdCols))
  names(redrecs) <- stdCols
  redrecs$sourcefile <- src

  redrecs$OUN <- gsub(" ","_",redrecs$OUN)

  # unique record name: <source><row number within file>_<file ID>
  redrecs$UN <- paste0(src, seq_len(nrow(redrecs)), "_", ID)

  print(paste(dim(redrecs)))
  print(paste(names(redrecs)))

  # store everything as character so that all files can be stacked safely
  redrecs[] <- lapply(redrecs, as.character)
  recList[[i]] <- redrecs

}

# bind the tables of all files in one go
recList <- Filter(Negate(is.null), recList)
if (length(recList) > 0) {
  alldata <- do.call(rbind, recList)
  rownames(alldata) <- NULL
}

# optional: drop the free-text locality column before writing
#alldata$locality<-NULL

# First checkpoint: the combined records exactly as assembled from the sources.
prelim1Path <- paste0(DataDir, "/CLEAN_DATA/", projectName, "_allRecs_prelim1.csv")
write.table(x = alldata, file = prelim1Path, sep = ",", quote = TRUE,
            qmethod = "double", na = "NA", row.names = FALSE, col.names = TRUE)

# optional: restart from the checkpoint instead of re-reading every source file
#alldata<-read.table(file=file.path(paste(DataDir, "/CLEAN_DATA/",projectName,"_allRecs_prelim1.csv",sep="")), header=TRUE, sep=",") #load data summary for all species






#############################
#correct data species names & known synonyms to right names and add mapping units for multi-species models
#############################

# Assign each record to one of the target taxa in `spp` (column myTaxon) and to
# the modelling unit of that taxon (column ModUnit). Records that match no
# taxon keep the text "NA" in both columns.

alldata$myTaxon<-"NA"
alldata$ModUnit<-"NA"

# Names under which taxon `sp` (row(s) `n` of SppSum) may appear in the data:
# the ";;"-separated entries of Data_Synonyms, the WHO name without its " 01"
# suffix, the accepted name, and the taxon itself with spaces instead of
# underscores. Missing, "NA" and empty entries are removed: an empty or NA
# synonym would otherwise match every record whose species or scientificName
# field is empty or NA.
getSynonyms <- function(n, sp) {
  syn <- c(str_split(SppSum$Data_Synonyms[n], ";;")[[1]],
           paste(gsub(" 01","",SppSum$WHOSpecies[n])),
           paste(SppSum$Accepted_Species[n]),
           gsub("_"," ",sp))
  syn[!is.na(syn) & syn != "NA" & syn != ""]
}

# a for loop over a factor iterates over its labels; do the same explicitly
sppNames <- as.character(spp)

# pass 1: collect the synonyms of every taxon (per taxon and pooled)
synList <- vector("list", length(sppNames))
for (k in seq_along(sppNames)){
  sp <- sppNames[k]
  n <- which(spp==sp)
  spsynonyms <- getSynonyms(n, sp)
  synList[[k]] <- spsynonyms
  print(paste("synonyms for",sp))
  print(spsynonyms)
}
allsynonyms <- unlist(synList)
if (is.null(allsynonyms)) allsynonyms <- vector()

# name combinations that do not change between taxa
genusSpecies <- paste(alldata$genus," ",alldata$species,sep="")
genusSpeciesSub <- paste(alldata$genus," ",alldata$species," ",alldata$subtaxon, sep="")

# pass 2: write the corrected taxonomic name into the full dataset
for (k in seq_along(sppNames)){

  #if (sp %in% c("Trimeresurus_stejnegeri","Trimeresurus_gumprechti",
  #              "Trimeresurus_medoensis","Trimeresurus_truongsonensis",
  #              "Trimeresurus_vogeli", "Trimeresurus_yunnanensis")){

  sp <- sppNames[k]
  n <- which(spp==sp)
  print(paste("correcting species synonyms for", sp))
  myModUnit <- SppSum$ModUnit[n]
  synonyms <- synList[[k]]
  print(paste(synonyms))
  # names that belong to some other taxon only
  notsynonyms <- allsynonyms[!(allsynonyms %in% synonyms)]

  # a record belongs to this taxon when any of its name fields is a synonym,
  # unless its scientificName is a name reserved for a different taxon
  isMatch <- (alldata$species %in% synonyms | alldata$scientificName %in% synonyms
              | genusSpecies %in% synonyms
              | genusSpeciesSub %in% synonyms) &
    !(alldata$scientificName %in% notsynonyms)

  alldata$myTaxon[isMatch] <- sp

  alldata$ModUnit <- ifelse(alldata$myTaxon==sp,paste(myModUnit),paste(alldata$ModUnit))
}#}






# Spot check: last literature records assigned to Trimeresurus stejnegeri.
tail(alldata[alldata$sourcefile == "LIT" & alldata$myTaxon == "Trimeresurus_stejnegeri", ])

# Taxon names use underscores instead of spaces from here on.
alldata$myTaxon <- gsub(" ", "_", alldata$myTaxon, fixed = TRUE)

# Second checkpoint: combined records with myTaxon / ModUnit assigned.
prelim2Path <- paste0(DataDir, "/CLEAN_DATA/", projectName, "_allRecs_prelim2.csv")
write.table(x = alldata, file = prelim2Path, sep = ",", quote = TRUE,
            qmethod = "double", na = "NA", row.names = FALSE, col.names = TRUE)

# optional: restart from this checkpoint
#alldata<-read.table(file=file.path(paste(DataDir, "/CLEAN_DATA/",projectName,"_allRecs_prelim2.csv",sep="")), header=TRUE, sep=",") #load data summary for all species









# Records of target taxa that have no coordinate but do have a written locality.
noCoords    <- is.na(alldata$lat) | is.na(alldata$long) | alldata$lat == "" | alldata$long == ""
hasLocality <- !is.na(alldata$locality) & alldata$locality != ""
isTarget    <- alldata$myTaxon %in% spp & alldata$myTaxon != ""
ExtraData <- alldata[which(noCoords & hasLocality & isTarget), ]

head(ExtraData)

# write file of all the data that has no lat long but has a written locality
# description that can be georeferenced manually if necessary
georefPath <- paste0(DataDir, "/CLEAN_DATA/", projectName, "_georef_manually.csv")
write.table(x = ExtraData, file = georefPath, sep = ",", quote = TRUE,
            qmethod = "double", na = "NA", row.names = FALSE, col.names = TRUE)

# optional: reload the file written above
#ExtraData<-read.table(file=file.path(paste(DataDir, "/CLEAN_DATA/",projectName,"_georef_manually.csv",sep="")), header=TRUE, sep=",") #load data summary for all species













###############################

# Quick check in both directions:
#  extras  - values of myTaxon that are not in the species list (wrong names ...)
#  missing - listed species for which the data contain no record at all
extras <- sort(setdiff(alldata$myTaxon, spp))
print(extras)
missing <- sort(spp[!(spp %in% alldata$myTaxon)])
print(missing)
#add<-unique(sort(alldata$scientificName [!((alldata$myTaxon) %in% spp)]))
#print(add)

# Coordinates become numeric; values outside the valid range are set to NA.
alldata$lat <- as.numeric(alldata$lat)
alldata$long <- as.numeric(alldata$long)
alldata$lat[is.na(alldata$lat) | abs(alldata$lat) > 90] <- NA
alldata$long[is.na(alldata$long) | abs(alldata$long) > 180] <- NA

# define 1km grid equivalent records:
# coordinates cut (not rounded!) to 2 decimal places
alldata$lattrunc1km <- trunc(alldata$lat * 100) / 100
alldata$longtrunc1km <- trunc(alldata$long * 100) / 100

# remove rows with missing data in crucial columns and fix other small problems:
alldata <- alldata[alldata$myTaxon != "", , drop = FALSE]

# where no uncertainty is given, fall back on the precision column; the result
# is cut down to a multiple of 10
uncText <- paste(alldata$uncertainty)
usePrecision <- is.na(alldata$uncertainty) | alldata$uncertainty == ""
uncText[usePrecision] <- paste(alldata$precision)[usePrecision]
alldata$uncertainty <- trunc(as.numeric(uncText) / 10) * 10
alldata$precision <- NULL #no longer needed, summarized in uncertainty

dateParts <- c("day", "month", "year")
alldata[dateParts] <- lapply(alldata[dateParts], as.numeric)

# second pass over the coordinates: a value of exactly 0 is now discarded too
alldata$lat[is.na(alldata$lat) | abs(alldata$lat) > 90 | alldata$lat == 0] <- NA
alldata$long[is.na(alldata$long) | abs(alldata$long) > 180 | alldata$long == 0] <- NA

# fixed country entries for two taxa (missing countries become the text "NA")
countryText <- paste(alldata$country)
countryText[alldata$myTaxon == "Protobothrops_cornutus"] <- "China"
countryText[alldata$myTaxon == "Gloydius_ussuriensis"] <- ""
alldata$country <- countryText
#substitute annoying characters
# `badchars` and `goodchars` are paired by position: the i-th entry of
# `badchars` is meant to be replaced by the i-th entry of `goodchars`.
badchars <-c("¼","½","¿","'","¨","”","™","»","\\·","‚","¶","§","Ä","ª","ä","å",
             "¡","ï","Í","î","è","é","É","Œ","Û","Ü","±","‡","†","°","º","Ð","Š",
             "\\t","\\n","\u008d","\u0081","?-???","?",".")
goodchars<-c("ue","ue","","","","","tm","","","","","s","a","a","ae","a",
             "i","i","i","i","e","e","e","oe","u","ue","","t","t"," deg"," deg","d","s",
             " "," ","","","","","")

# NOTE: every replacement inside this loop is commented out, so the loop
# currently only computes the index `m` and leaves the data untouched.
# NOTE(review): before re-enabling any of the lines below, check the last
# entries of `badchars` - with fixed=TRUE, "?" and "." would strip every
# question mark and full stop, and "\\t" / "\\n" would only match a literal
# backslash followed by the letter, not a tab or a newline.
for (char in badchars){
  # position of the current character, i.e. index of its replacement
  m<-which(badchars==char)
  #alldata$subtaxon<-gsub(char,goodchars[m],alldata$subtaxon,fixed=TRUE)
  #alldata$country<-gsub(char,goodchars[m],alldata$country,fixed=TRUE)
  #alldata$scientificName <-gsub(char,goodchars[m],alldata$scientificName,fixed=TRUE)
  #alldata$establishmentMeans<-gsub(char,goodchars[m],alldata$establishmentMeans,fixed=TRUE)
  }

# In columns 1 to 11 and 19 to 22, replace missing values and "-" placeholders
# by an empty string; the affected columns are stored as text afterwards.
# NOTE(review): the columns are addressed by position. With the column order
# assembled earlier in this script and `precision` already removed, positions
# 19:22 appear to be year, establishmentMeans, issue and locality - confirm
# that this is the intended set.
blankCols <- c(1:11, 19:22)
alldata[blankCols] <- lapply(alldata[blankCols], function(v) {
  txt <- paste(v)
  txt[is.na(v) | v == "-"] <- ""
  txt
})

# keep only the records that have both a latitude and a longitude
hasLat <- !is.na(alldata$lat) & alldata$lat != ""
hasLong <- !is.na(alldata$long) & alldata$long != ""
alldata <- alldata[which(hasLat & hasLong), ]

# Coordinates that were recorded with few decimal places get a minimum
# uncertainty that reflects their resolution:
#   whole degrees           -> at least 100 km
#   at most 1 decimal place -> at least 10 km
#   at most 2 decimal places -> at least 1 km
#
# Whether a coordinate has at most k decimal places is tested with a small
# tolerance: an exact `x*10^k == trunc(x*10^k)` comparison fails for values
# such as 1.1*100, which is not exactly 110 in floating point.
#
# NOTE(review): records without any uncertainty (NA) are left as NA, exactly as
# before - confirm whether they should receive the minimum value as well.

# TRUE where x is a whole number within floating-point tolerance
isWholeNumber <- function(x, tol = 1e-9) {
  abs(x - round(x)) < tol
}

for (decimals in 0:2) {
  scaleBy <- 10^decimals
  minUncertainty <- 100000 / scaleBy

  isCoarse <- isWholeNumber(alldata$long * scaleBy) &
    isWholeNumber(alldata$lat * scaleBy)
  tooSmall <- !is.na(alldata$uncertainty) & alldata$uncertainty < minUncertainty

  alldata$uncertainty[which(isCoarse & tooSmall)] <- minUncertainty
}

############ save raw background as all data with coordinates,        ####################
############ including non-medically-relevant species from base files ####################

# taxon and coordinates only, under the column names used for the background file
bg <- setNames(alldata[, c("myTaxon", "long", "lat")],
               c("species", "longitude", "latitude"))

# number of records that were not assigned to any target taxon
unassigned <- is.na(bg$species) | bg$species == "NA"
sum(unassigned)

# unassigned records are labelled "other"
bgSpecies <- paste(bg$species)
bgSpecies[unassigned] <- "other"
bg$species <- bgSpecies

bgPath <- paste0(MyDir, "/3_Maxent_SWDs/backgrounds/", projectName, "_bgSWD_raw.csv")
write.table(x = bg, file = bgPath, sep = ",", quote = TRUE,
            qmethod = "double", na = "NA", row.names = FALSE, col.names = TRUE)

############ write final version of all data for all relevant species #####################

# drop records whose myTaxon is missing or empty
# NOTE(review): unmatched records carry the text "NA" in myTaxon (that is the
# value the column is initialised with), so they pass both tests and are still
# written to the file below - confirm whether they should be removed here.
hasTaxon <- !is.na(alldata$myTaxon) & alldata$myTaxon != ""
alldata <- alldata[which(hasTaxon), ]

finalPath <- paste0(DataDir, "/CLEAN_DATA/", projectName, "_allRecs.csv")
write.table(x = alldata, file = finalPath, sep = ",", quote = TRUE,
            qmethod = "double", na = "NA", row.names = FALSE, col.names = TRUE)


###########################################################################################

# read the final file back in
alldata2 <- read.table(file = finalPath, header = TRUE, sep = ",")



############################################
############################################
# next: 2-VAP_data_justspp #################
############################################

